Data Vizualization Project

Analysis and Prediction of Indian General Election

Shivam Baranwal 19BCE1350

Arya Adarsh 19BCE1556

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px
In [2]:
df = pd.read_csv("LS_2.0 (1).csv")
In [3]:
pd.set_option('display.max_rows',20000, 'display.max_columns',100)
In [4]:
df.shape
Out[4]:
(2263, 19)
In [5]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2263 entries, 0 to 2262
Data columns (total 19 columns):
STATE                                       2263 non-null object
CONSTITUENCY                                2263 non-null object
NAME                                        2263 non-null object
WINNER                                      2263 non-null int64
PARTY                                       2263 non-null object
SYMBOL                                      2018 non-null object
GENDER                                      2018 non-null object
CRIMINAL
CASES                              2018 non-null object
AGE                                         2018 non-null float64
CATEGORY                                    2018 non-null object
EDUCATION                                   2018 non-null object
ASSETS                                      2018 non-null object
LIABILITIES                                 2018 non-null object
GENERAL
VOTES                               2263 non-null int64
POSTAL
VOTES                                2263 non-null int64
TOTAL
VOTES                                 2263 non-null int64
OVER TOTAL ELECTORS 
IN CONSTITUENCY        2263 non-null float64
OVER TOTAL VOTES POLLED 
IN CONSTITUENCY    2263 non-null float64
TOTAL ELECTORS                              2263 non-null int64
dtypes: float64(3), int64(5), object(11)
memory usage: 336.0+ KB
In [6]:
df.head(5)
Out[6]:
STATE CONSTITUENCY NAME WINNER PARTY SYMBOL GENDER CRIMINAL CASES AGE CATEGORY EDUCATION ASSETS LIABILITIES GENERAL VOTES POSTAL VOTES TOTAL VOTES OVER TOTAL ELECTORS IN CONSTITUENCY OVER TOTAL VOTES POLLED IN CONSTITUENCY TOTAL ELECTORS
0 Telangana ADILABAD SOYAM BAPU RAO 1 BJP Lotus MALE 52 52.0 ST 12th Pass Rs 30,99,414\n ~ 30 Lacs+ Rs 2,31,450\n ~ 2 Lacs+ 376892 482 377374 25.330684 35.468248 1489790
1 Telangana ADILABAD Godam Nagesh 0 TRS Car MALE 0 54.0 ST Post Graduate Rs 1,84,77,888\n ~ 1 Crore+ Rs 8,47,000\n ~ 8 Lacs+ 318665 149 318814 21.399929 29.964370 1489790
2 Telangana ADILABAD RATHOD RAMESH 0 INC Hand MALE 3 52.0 ST 12th Pass Rs 3,64,91,000\n ~ 3 Crore+ Rs 1,53,00,000\n ~ 1 Crore+ 314057 181 314238 21.092771 29.534285 1489790
3 Telangana ADILABAD NOTA 0 NOTA NaN NaN NaN NaN NaN NaN NaN NaN 13030 6 13036 0.875023 1.225214 1489790
4 Uttar Pradesh AGRA Satyapal Singh Baghel 1 BJP Lotus MALE 5 58.0 SC Doctorate Rs 7,42,74,036\n ~ 7 Crore+ Rs 86,06,522\n ~ 86 Lacs+ 644459 2416 646875 33.383823 56.464615 1937690
In [7]:
df.isnull().sum().sort_values(ascending=False).head(30)
Out[7]:
CATEGORY                                     245
GENDER                                       245
LIABILITIES                                  245
ASSETS                                       245
EDUCATION                                    245
SYMBOL                                       245
AGE                                          245
CRIMINAL\nCASES                              245
CONSTITUENCY                                   0
NAME                                           0
WINNER                                         0
PARTY                                          0
TOTAL ELECTORS                                 0
OVER TOTAL VOTES POLLED \nIN CONSTITUENCY      0
GENERAL\nVOTES                                 0
POSTAL\nVOTES                                  0
TOTAL\nVOTES                                   0
OVER TOTAL ELECTORS \nIN CONSTITUENCY          0
STATE                                          0
dtype: int64
In [8]:
df[df['NAME']=='NOTA'].head()
Out[8]:
STATE CONSTITUENCY NAME WINNER PARTY SYMBOL GENDER CRIMINAL CASES AGE CATEGORY EDUCATION ASSETS LIABILITIES GENERAL VOTES POSTAL VOTES TOTAL VOTES OVER TOTAL ELECTORS IN CONSTITUENCY OVER TOTAL VOTES POLLED IN CONSTITUENCY TOTAL ELECTORS
3 Telangana ADILABAD NOTA 0 NOTA NaN NaN NaN NaN NaN NaN NaN NaN 13030 6 13036 0.875023 1.225214 1489790
14 Gujarat AHMEDABAD WEST NOTA 0 NOTA NaN NaN NaN NaN NaN NaN NaN NaN 14580 139 14719 0.895688 1.473030 1643317
39 West Bengal ALIPURDUARS NOTA 0 NOTA NaN NaN NaN NaN NaN NaN NaN NaN 21147 28 21175 1.284592 1.533114 1648383
46 Uttarakhand ALMORA NOTA 0 NOTA NaN NaN NaN NaN NaN NaN NaN NaN 15311 194 15505 1.158985 2.215611 1337808
54 Andhra Pradesh AMALAPURAM NOTA 0 NOTA NaN NaN NaN NaN NaN NaN NaN NaN 16427 41 16468 1.128288 1.333044 1459556
In [9]:
df = df.fillna({'SYMBOL':'NO SYMBOL',
                'GENDER':'NOT APPLICABLE',
                'CRIMINAL\nCASES':0,'AGE':0.0,
                'CATEGORY':'NOT APPLICABLE',
                'EDUCATION':'NOT APPLICABLE',
                'ASSETS':'Rs 0',
                'LIABILITIES':'Rs 0',})
In [10]:
df.isnull().sum().sort_values(ascending=False).head(30)
Out[10]:
TOTAL ELECTORS                               0
AGE                                          0
CONSTITUENCY                                 0
NAME                                         0
WINNER                                       0
PARTY                                        0
SYMBOL                                       0
GENDER                                       0
CRIMINAL\nCASES                              0
CATEGORY                                     0
OVER TOTAL VOTES POLLED \nIN CONSTITUENCY    0
EDUCATION                                    0
ASSETS                                       0
LIABILITIES                                  0
GENERAL\nVOTES                               0
POSTAL\nVOTES                                0
TOTAL\nVOTES                                 0
OVER TOTAL ELECTORS \nIN CONSTITUENCY        0
STATE                                        0
dtype: int64
In [11]:
(df['PARTY'].value_counts()/len(df['PARTY']))*100
Out[11]:
BJP           18.559434
INC           18.250110
NOTA          10.826337
IND            8.882015
BSP            7.202828
CPI(M)         4.418913
AITC           2.076889
VBA            2.076889
SP             1.723376
NTK            1.679187
MNM            1.590809
SHS            1.148917
TDP            1.104728
AAP            1.104728
YSRCP          1.104728
DMK            1.016350
NCP            0.927972
BJD            0.927972
RJD            0.927972
AIADMK         0.927972
JnP            0.883783
JD(U)          0.795404
TRS            0.751215
SBSP           0.441891
SAD            0.441891
APoI           0.397702
JD(S)          0.397702
PMK            0.309324
LJP            0.265135
CPI(ML)(L)     0.265135
JNJP           0.265135
JMM            0.220946
BLSP           0.220946
BMUP           0.220946
GGP            0.220946
RSP            0.220946
NPEP           0.220946
BDJS           0.176757
SDPI           0.176757
DMDK           0.176757
BTP            0.176757
INLD           0.176757
AIFB           0.132567
RAHIS          0.132567
RSPSR          0.132567
IUML           0.132567
VSIP           0.132567
PSPL           0.132567
SSD            0.132567
JKN            0.132567
AIUDF          0.132567
PUNEKP         0.132567
HAMS           0.132567
LIP            0.132567
AIMIM          0.132567
RLD            0.132567
AGP            0.132567
PPID           0.132567
JPC            0.132567
NEINDP         0.088378
JVM            0.088378
PPA            0.088378
JKPDP          0.088378
IPFT           0.088378
ADAL           0.088378
JKP            0.088378
AAM            0.088378
JDL            0.088378
RVNP           0.088378
LTSP           0.088378
UPPL           0.088378
SWP            0.088378
DSSP           0.044189
SKM            0.044189
JDR            0.044189
SDF            0.044189
RLTP           0.044189
SUCI(C)        0.044189
RTORP          0.044189
RSOSP          0.044189
RMPOI          0.044189
AIPF           0.044189
BVA            0.044189
BARESP         0.044189
BJKVP          0.044189
MOSP           0.044189
JANADIP        0.044189
BSCP           0.044189
PDP            0.044189
VCSMP          0.044189
AKBMP          0.044189
ravp           0.044189
VPI            0.044189
BRPI           0.044189
AHFBK          0.044189
ANC            0.044189
WAP            0.044189
BOPF           0.044189
AJSUP          0.044189
ABGP           0.044189
SJDD           0.044189
KEC(M)         0.044189
PMP            0.044189
JHP            0.044189
NDPP           0.044189
CPIM           0.044189
NAWPP          0.044189
UDP            0.044189
MSHP           0.044189
TMC(M)         0.044189
BLSD           0.044189
PHJSP          0.044189
WPOI           0.044189
VCK            0.044189
JAPL           0.044189
SPL            0.044189
TJS            0.044189
YKP            0.044189
SAD(M)         0.044189
KEC            0.044189
MADP           0.044189
AINRC          0.044189
ABSKP          0.044189
AHNP           0.044189
PRISMP         0.044189
BNDl           0.044189
BLRP           0.044189
NPF            0.044189
MNF            0.044189
AJPR           0.044189
ASDC           0.044189
JKNPP          0.044189
BBMP           0.044189
Name: PARTY, dtype: float64
In [12]:
df.head()
Out[12]:
STATE CONSTITUENCY NAME WINNER PARTY SYMBOL GENDER CRIMINAL CASES AGE CATEGORY EDUCATION ASSETS LIABILITIES GENERAL VOTES POSTAL VOTES TOTAL VOTES OVER TOTAL ELECTORS IN CONSTITUENCY OVER TOTAL VOTES POLLED IN CONSTITUENCY TOTAL ELECTORS
0 Telangana ADILABAD SOYAM BAPU RAO 1 BJP Lotus MALE 52 52.0 ST 12th Pass Rs 30,99,414\n ~ 30 Lacs+ Rs 2,31,450\n ~ 2 Lacs+ 376892 482 377374 25.330684 35.468248 1489790
1 Telangana ADILABAD Godam Nagesh 0 TRS Car MALE 0 54.0 ST Post Graduate Rs 1,84,77,888\n ~ 1 Crore+ Rs 8,47,000\n ~ 8 Lacs+ 318665 149 318814 21.399929 29.964370 1489790
2 Telangana ADILABAD RATHOD RAMESH 0 INC Hand MALE 3 52.0 ST 12th Pass Rs 3,64,91,000\n ~ 3 Crore+ Rs 1,53,00,000\n ~ 1 Crore+ 314057 181 314238 21.092771 29.534285 1489790
3 Telangana ADILABAD NOTA 0 NOTA NO SYMBOL NOT APPLICABLE 0 0.0 NOT APPLICABLE NOT APPLICABLE Rs 0 Rs 0 13030 6 13036 0.875023 1.225214 1489790
4 Uttar Pradesh AGRA Satyapal Singh Baghel 1 BJP Lotus MALE 5 58.0 SC Doctorate Rs 7,42,74,036\n ~ 7 Crore+ Rs 86,06,522\n ~ 86 Lacs+ 644459 2416 646875 33.383823 56.464615 1937690
In [13]:
(df['WINNER'].value_counts()/len(df['WINNER']))*100
Out[13]:
0    76.182059
1    23.817941
Name: WINNER, dtype: float64
In [14]:
plt.rcParams['figure.figsize'] = (12,8)
labels = 'LOST', 'WON'
sizes = [76,24]
explode = (0, 0.1)  

fig1, ax1 = plt.subplots()
ax1.pie(sizes, explode=explode, labels=labels, autopct='%0.0f%%',
        shadow=True, startangle=90,center=(0, 0))
ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
plt.title('2019 GENERAL ELECTION WINNING-LOSING PERCENTAGE')
plt.show()
In [15]:
plt.figure(figsize=(10,8))
sns.countplot(y='EDUCATION',hue='WINNER',data=df,palette="GnBu")
Out[15]:
<matplotlib.axes._subplots.AxesSubplot at 0x2354ec739b0>
In [16]:
plt.figure(figsize=(10,8))
sns.countplot(x='WINNER',hue='GENDER',data=df)
Out[16]:
<matplotlib.axes._subplots.AxesSubplot at 0x2354f01c6d8>
In [17]:
#Filling Nan Values in CRIMINAL CASE COLUMN
df['CRIMINAL\nCASES']=df['CRIMINAL\nCASES'].replace(to_replace ="Not Available", 
                 value ="0")
df['CRIMINAL\nCASES'] = df['CRIMINAL\nCASES'].astype(int)
In [18]:
df.head()
Out[18]:
STATE CONSTITUENCY NAME WINNER PARTY SYMBOL GENDER CRIMINAL CASES AGE CATEGORY EDUCATION ASSETS LIABILITIES GENERAL VOTES POSTAL VOTES TOTAL VOTES OVER TOTAL ELECTORS IN CONSTITUENCY OVER TOTAL VOTES POLLED IN CONSTITUENCY TOTAL ELECTORS
0 Telangana ADILABAD SOYAM BAPU RAO 1 BJP Lotus MALE 52 52.0 ST 12th Pass Rs 30,99,414\n ~ 30 Lacs+ Rs 2,31,450\n ~ 2 Lacs+ 376892 482 377374 25.330684 35.468248 1489790
1 Telangana ADILABAD Godam Nagesh 0 TRS Car MALE 0 54.0 ST Post Graduate Rs 1,84,77,888\n ~ 1 Crore+ Rs 8,47,000\n ~ 8 Lacs+ 318665 149 318814 21.399929 29.964370 1489790
2 Telangana ADILABAD RATHOD RAMESH 0 INC Hand MALE 3 52.0 ST 12th Pass Rs 3,64,91,000\n ~ 3 Crore+ Rs 1,53,00,000\n ~ 1 Crore+ 314057 181 314238 21.092771 29.534285 1489790
3 Telangana ADILABAD NOTA 0 NOTA NO SYMBOL NOT APPLICABLE 0 0.0 NOT APPLICABLE NOT APPLICABLE Rs 0 Rs 0 13030 6 13036 0.875023 1.225214 1489790
4 Uttar Pradesh AGRA Satyapal Singh Baghel 1 BJP Lotus MALE 5 58.0 SC Doctorate Rs 7,42,74,036\n ~ 7 Crore+ Rs 86,06,522\n ~ 86 Lacs+ 644459 2416 646875 33.383823 56.464615 1937690
In [19]:
#Checking if the candidate is criminal/ncase or not
for i in range(len(df)):
    if df.iloc[i,7]>0:
        df.iloc[i,7]='HAS CASE'
    else:
        df.iloc[i,7]='NO CASE'
In [20]:
plt.figure(figsize=(8,4))
sns.countplot(x='WINNER',hue='CRIMINAL\nCASES',data=df)
Out[20]:
<matplotlib.axes._subplots.AxesSubplot at 0x2354f114fd0>
In [21]:
sns.countplot(y='GENDER',hue='CRIMINAL\nCASES',data=df,palette='hot')
Out[21]:
<matplotlib.axes._subplots.AxesSubplot at 0x2354f1d0b00>
In [22]:
df.head()
Out[22]:
STATE CONSTITUENCY NAME WINNER PARTY SYMBOL GENDER CRIMINAL CASES AGE CATEGORY EDUCATION ASSETS LIABILITIES GENERAL VOTES POSTAL VOTES TOTAL VOTES OVER TOTAL ELECTORS IN CONSTITUENCY OVER TOTAL VOTES POLLED IN CONSTITUENCY TOTAL ELECTORS
0 Telangana ADILABAD SOYAM BAPU RAO 1 BJP Lotus MALE HAS CASE 52.0 ST 12th Pass Rs 30,99,414\n ~ 30 Lacs+ Rs 2,31,450\n ~ 2 Lacs+ 376892 482 377374 25.330684 35.468248 1489790
1 Telangana ADILABAD Godam Nagesh 0 TRS Car MALE NO CASE 54.0 ST Post Graduate Rs 1,84,77,888\n ~ 1 Crore+ Rs 8,47,000\n ~ 8 Lacs+ 318665 149 318814 21.399929 29.964370 1489790
2 Telangana ADILABAD RATHOD RAMESH 0 INC Hand MALE HAS CASE 52.0 ST 12th Pass Rs 3,64,91,000\n ~ 3 Crore+ Rs 1,53,00,000\n ~ 1 Crore+ 314057 181 314238 21.092771 29.534285 1489790
3 Telangana ADILABAD NOTA 0 NOTA NO SYMBOL NOT APPLICABLE NO CASE 0.0 NOT APPLICABLE NOT APPLICABLE Rs 0 Rs 0 13030 6 13036 0.875023 1.225214 1489790
4 Uttar Pradesh AGRA Satyapal Singh Baghel 1 BJP Lotus MALE HAS CASE 58.0 SC Doctorate Rs 7,42,74,036\n ~ 7 Crore+ Rs 86,06,522\n ~ 86 Lacs+ 644459 2416 646875 33.383823 56.464615 1937690
In [23]:
df[df['ASSETS']=='Not Available']
Out[23]:
STATE CONSTITUENCY NAME WINNER PARTY SYMBOL GENDER CRIMINAL CASES AGE CATEGORY EDUCATION ASSETS LIABILITIES GENERAL VOTES POSTAL VOTES TOTAL VOTES OVER TOTAL ELECTORS IN CONSTITUENCY OVER TOTAL VOTES POLLED IN CONSTITUENCY TOTAL ELECTORS
468 Bihar BUXAR Ramchandra Singh Yadav 0 IND Almirah MALE NO CASE 42.0 GENERAL Not Available Not Available Not Available 10721 9 10730 0.586540 1.087175 1829373
532 Tamil Nadu CHIDAMBARAM SIVAJOTHI M 0 NTK Ganna Kisan MALE NO CASE 35.0 SC Not Available Not Available Not Available 37329 142 37471 2.531445 3.246331 1480222
612 Uttar Pradesh DEORIA BINOD KUMAR JAISWAL 0 BSP Elephant MALE NO CASE 56.0 GENERAL Not Available Not Available Not Available 327728 2985 330713 18.852693 32.563441 1754195
613 Uttar Pradesh DEORIA NIYAZ AHMED 0 INC Hand MALE NO CASE 57.0 GENERAL Not Available Not Available Not Available 50749 307 51056 2.910509 5.027196 1754195
654 Tamil Nadu DINDIGUL JOTHIMUTHU, K. 0 PMK Mango MALE NO CASE 48.0 GENERAL Not Available Not Available Not Available 206782 769 207551 13.460896 17.877979 1541881
656 Tamil Nadu DINDIGUL MANSOORALI KHAN, A. 0 NTK Ganna Kisan MALE NO CASE 56.0 GENERAL Not Available Not Available Not Available 54574 383 54957 3.564283 4.733873 1541881
688 Andhra Pradesh ELURU PENTAPATI PULLARAO 0 JnP Glass Tumbler MALE NO CASE 71.0 GENERAL Not Available Not Available Not Available 76481 346 76827 4.816891 5.766884 1594950
972 Bihar JAMUI (SC) UPENDRA RAVIDAS 0 BSP Elephant MALE NO CASE 40.0 SC Not Available Not Available Not Available 31504 107 31611 1.839117 3.329012 1718814
1033 Uttar Pradesh KAISERGANJ Vinay Kumar Pandey 'Vinnu' 0 INC Hand MALE NO CASE 54.0 GENERAL Not Available Not Available Not Available 37049 83 37132 2.056097 3.780019 1805946
1052 Tamil Nadu KALLAKURICHI Sharfudeen S 0 NTK Ganna Kisan MALE NO CASE 40.0 GENERAL Not Available Not Available Not Available 30103 143 30246 1.978236 2.510177 1528938
1126 Uttar Pradesh KAUSHAMBI SHAILENDRA KUMAR 0 IND Football MALE NO CASE 44.0 SC Not Available Not Available Not Available 26954 13 26967 1.508964 2.765741 1787120
1309 Bihar MADHUBANI BADRI KUMAR PURBEY 0 VSIP Boat with Man and Sail MALE NO CASE 46.0 GENERAL Not Available Not Available Not Available 140705 198 140903 7.859391 14.605797 1792798
1681 Kerala PONNANI P.V. Anvar Puthan Veetil 0 IND Scissors MALE NO CASE 51.0 GENERAL Not Available Not Available Not Available 328208 343 328551 24.215085 32.294278 1356803
1696 Puducherry PUDUCHERRY N.SHARMILA BEGUM 0 NTK Ganna Kisan FEMALE NO CASE 30.0 GENERAL Not Available Not Available Not Available 22842 15 22857 2.348137 2.890017 973410
1716 Bihar PURVI CHAMPARAN AAKASH KUMAR SINGH 0 BLSP Ceiling Fan MALE NO CASE 27.0 GENERAL Not Available Not Available Not Available 281500 2639 284139 17.130512 28.410065 1658672
1938 Madhya Pradesh SIDHI Dileep Kumar Shukla 0 IND Balloon MALE NO CASE 34.0 GENERAL Not Available Not Available Not Available 15555 0 15555 0.842840 1.212672 1845547
1971 Tamil Nadu SIVAGANGA V.PANDI 0 IND Gift Pack MALE NO CASE 43.0 GENERAL Not Available Not Available Not Available 122325 209 122534 7.895135 11.294435 1552019
1978 Bihar SIWAN Deva Kant Mishra Alias Munna Bhaiya 0 IND Chimney MALE NO CASE 43.0 GENERAL Not Available Not Available Not Available 36458 1 36459 2.026005 3.701985 1799551
2069 Tamil Nadu THENI THANGA TAMILSELVAN 0 IND Gift Pack MALE NO CASE 58.0 GENERAL Not Available Not Available Not Available 143173 877 144050 9.227822 12.259491 1561040
2188 Bihar VAISHALI Rinkoo Devi 0 IND Cot FEMALE NO CASE 37.0 GENERAL Not Available Not Available Not Available 16734 4 16738 0.964180 1.557384 1735983
2223 Tamil Nadu VIRUDHUNAGAR MUNIYASAMY, V. 0 MNM Battery Torch MALE NO CASE 41.0 GENERAL Not Available Not Available Not Available 56815 314 57129 3.848999 5.309752 1484256
2249 Kerala WAYANAD Thushar Vellappally 0 BDJS Pot MALE NO CASE 50.0 GENERAL Not Available Not Available Not Available 78590 226 78816 5.796662 7.212569 1359679
In [24]:
for i in range(len(df)):
    if df.iloc[i,11]=='Not Available':
        df.iloc[i,11] = "-1"    
In [25]:
#Removing '\n ~' from ASSETS
df['ASSETS'] = df['ASSETS'].str.split('\n ~', 1, expand=True)[0]
#Removing 'Rs' from ASSETS
df['ASSETS'] = df['ASSETS'].str.split(expand=True)[1]
#Removing special charater "," and then joinig it
df['ASSETS']=df['ASSETS'].str.split(",").str.join(" ")
#Removing the space between the join
df['ASSETS']=df['ASSETS'].str.replace(' ', '')
#Converting ASSETS column into flaot as int values are very high
df['ASSETS'] = df['ASSETS'].astype(float)
In [26]:
df['ASSETS'] = df['ASSETS'].fillna(value = 0.0)
In [27]:
STATUS = []
for i in df['ASSETS']:
    if i >0.0 and i < 500000.0:
        STATUS.append('NEAR TO BPL')
    if i >= 500000.0 and i <= 1000000.0:
        STATUS.append('LOWER CLASS')
    if i >= 1000001.0 and i <= 2500000.0:
        STATUS.append('LOWER MIDDLE CLASS')
    elif i >= 2500001.0 and i <= 10000000.0:
        STATUS.append('MIDDLE CLASS')
    elif i >= 10000001.0 and i <= 100000000.0:
        STATUS.append('UPPER MIDDLE CLASS')
    elif i >= 100000000.0 and i <= 250000000.0:
        STATUS.append('ELITE CLASS')
    elif i >= 250000001.0 and i <= 1000000000.0:
        STATUS.append('SUPER RICH')
    elif i >= 1000000001.0:
        STATUS.append('RICHEST OF RICH')
    elif i == -1.0:
        STATUS.append('ASSETS NOT MENTIONED')
    elif i == 0.0:
        STATUS.append('NO ASSETS')
df['STATUS'] = STATUS
In [28]:
sns.countplot(y='STATUS',hue='CRIMINAL\nCASES',data=df,palette='hot')
Out[28]:
<matplotlib.axes._subplots.AxesSubplot at 0x2354f4bbdd8>
In [29]:
print(df['PARTY'].nunique(),df['PARTY'].unique())
133 ['BJP' 'TRS' 'INC' 'NOTA' 'BSP' 'NCP' 'VBA' 'APoI' 'CPI(M)' 'BDJS' 'AITC'
 'RSP' 'SP' 'YSRCP' 'TDP' 'JnP' 'INLD' 'SBSP' 'IND' 'SHS' 'AAP' 'SAD'
 'JKN' 'JKPDP' 'JPC' 'DMK' 'PMK' 'NTK' 'MNM' 'AIADMK' 'RJD' 'CPI(ML)(L)'
 'SSD' 'PPA' 'JD(S)' 'NPEP' 'BMUP' 'BJD' 'AIMIM' 'HAMS' 'AHFBK' 'PPID'
 'SPL' 'ASDC' 'RLD' 'PSPL' 'JD(U)' 'BTP' 'AIFB' 'AGP' 'AIUDF' 'ABSKP'
 'PUNEKP' 'RTORP' 'JNJP' 'LTSP' 'RVNP' 'JANADIP' 'SDPI' 'DMDK' 'ABGP'
 'VCK' 'JMM' 'LIP' 'JDR' 'MOSP' 'MADP' 'AJPR' 'PMP' 'BBMP' 'AJSUP' 'JVM'
 'RMPOI' 'LJP' 'BJKVP' 'SWP' 'NEINDP' 'RSPSR' 'ravp' 'RSOSP' 'BLSP' 'WPOI'
 'SUCI(C)' 'SJDD' 'ANC' 'JDL' 'VSIP' 'AAM' 'JKP' 'BOPF' 'UPPL' 'CPIM'
 'GGP' 'KEC(M)' 'KEC' 'JAPL' 'AKBMP' 'TJS' 'IUML' 'BSCP' 'ADAL' 'BRPI'
 'MNF' 'PRISMP' 'VPI' 'YKP' 'NDPP' 'RLTP' 'RAHIS' 'NPF' 'BLSD' 'BVA'
 'NAWPP' 'AINRC' 'BNDl' 'MSHP' 'BARESP' 'BLRP' 'AIPF' 'WAP' 'VCSMP'
 'SAD(M)' 'UDP' 'SKM' 'SDF' 'PDP' 'JHP' 'TMC(M)' 'IPFT' 'JKNPP' 'DSSP'
 'AHNP' 'PHJSP']
In [30]:
#Using Groupby on Party with Total Votes(TO CHECK HIGHEST PERCENTAGE OF VOTES FOR TOP 10 PARTIES)
p = df.groupby('PARTY')['TOTAL\nVOTES']
X = ((p.sum()/df['TOTAL\nVOTES'].sum())*100).sort_values(ascending = False).head(10)
X
Out[30]:
PARTY
BJP       38.526233
INC       20.096019
AITC       4.178796
BSP        3.501644
SP         2.627939
YSRCP      2.614598
CPI(M)     2.386397
DMK        2.335416
SHS        2.123538
TDP        2.106107
Name: TOTAL\nVOTES, dtype: float64
In [31]:
plt.rcParams['figure.figsize'] = (12,8)
labels = 'BJP', 'INC','AITC','BSP','SP','YSRCP','CPI(M)','DMK','SHS','TDP','OTHERS'
sizes = [38,20,4,3,2,2,2,2,2,2,23]
 
colors=('orange', 'green', 'deeppink', 'blue', 'red', 'yellow', 'crimson', 'brown','darkorange','pink','gray')

my_circle = plt.Circle((0, 0), 0.7, color='white')

d = plt.pie(sizes, labels=labels, autopct='%0.0f%%',
            startangle=90,colors=colors, labeldistance=1.05)
plt.axis('equal')
plt.gca().add_artist(my_circle)
plt.title('VOTE SHARE OF PARTY')            
plt.show()
In [32]:
df.head()
Out[32]:
STATE CONSTITUENCY NAME WINNER PARTY SYMBOL GENDER CRIMINAL CASES AGE CATEGORY EDUCATION ASSETS LIABILITIES GENERAL VOTES POSTAL VOTES TOTAL VOTES OVER TOTAL ELECTORS IN CONSTITUENCY OVER TOTAL VOTES POLLED IN CONSTITUENCY TOTAL ELECTORS STATUS
0 Telangana ADILABAD SOYAM BAPU RAO 1 BJP Lotus MALE HAS CASE 52.0 ST 12th Pass 3099414.0 Rs 2,31,450\n ~ 2 Lacs+ 376892 482 377374 25.330684 35.468248 1489790 MIDDLE CLASS
1 Telangana ADILABAD Godam Nagesh 0 TRS Car MALE NO CASE 54.0 ST Post Graduate 18477888.0 Rs 8,47,000\n ~ 8 Lacs+ 318665 149 318814 21.399929 29.964370 1489790 UPPER MIDDLE CLASS
2 Telangana ADILABAD RATHOD RAMESH 0 INC Hand MALE HAS CASE 52.0 ST 12th Pass 36491000.0 Rs 1,53,00,000\n ~ 1 Crore+ 314057 181 314238 21.092771 29.534285 1489790 UPPER MIDDLE CLASS
3 Telangana ADILABAD NOTA 0 NOTA NO SYMBOL NOT APPLICABLE NO CASE 0.0 NOT APPLICABLE NOT APPLICABLE 0.0 Rs 0 13030 6 13036 0.875023 1.225214 1489790 NO ASSETS
4 Uttar Pradesh AGRA Satyapal Singh Baghel 1 BJP Lotus MALE HAS CASE 58.0 SC Doctorate 74274036.0 Rs 86,06,522\n ~ 86 Lacs+ 644459 2416 646875 33.383823 56.464615 1937690 UPPER MIDDLE CLASS
In [33]:
df['AGE'].nunique()
Out[33]:
61
In [34]:
df.head()
Out[34]:
STATE CONSTITUENCY NAME WINNER PARTY SYMBOL GENDER CRIMINAL CASES AGE CATEGORY EDUCATION ASSETS LIABILITIES GENERAL VOTES POSTAL VOTES TOTAL VOTES OVER TOTAL ELECTORS IN CONSTITUENCY OVER TOTAL VOTES POLLED IN CONSTITUENCY TOTAL ELECTORS STATUS
0 Telangana ADILABAD SOYAM BAPU RAO 1 BJP Lotus MALE HAS CASE 52.0 ST 12th Pass 3099414.0 Rs 2,31,450\n ~ 2 Lacs+ 376892 482 377374 25.330684 35.468248 1489790 MIDDLE CLASS
1 Telangana ADILABAD Godam Nagesh 0 TRS Car MALE NO CASE 54.0 ST Post Graduate 18477888.0 Rs 8,47,000\n ~ 8 Lacs+ 318665 149 318814 21.399929 29.964370 1489790 UPPER MIDDLE CLASS
2 Telangana ADILABAD RATHOD RAMESH 0 INC Hand MALE HAS CASE 52.0 ST 12th Pass 36491000.0 Rs 1,53,00,000\n ~ 1 Crore+ 314057 181 314238 21.092771 29.534285 1489790 UPPER MIDDLE CLASS
3 Telangana ADILABAD NOTA 0 NOTA NO SYMBOL NOT APPLICABLE NO CASE 0.0 NOT APPLICABLE NOT APPLICABLE 0.0 Rs 0 13030 6 13036 0.875023 1.225214 1489790 NO ASSETS
4 Uttar Pradesh AGRA Satyapal Singh Baghel 1 BJP Lotus MALE HAS CASE 58.0 SC Doctorate 74274036.0 Rs 86,06,522\n ~ 86 Lacs+ 644459 2416 646875 33.383823 56.464615 1937690 UPPER MIDDLE CLASS
In [35]:
plt.rcParams['figure.figsize'] = (10,8)
labels = 'BJP','INC','DMK','YSRCP','AITC','SHS','JD(U)','BSP','BJD','TRS','OTHER'
sizes = [300, 52, 23, 22, 22, 18, 16, 11, 11, 9,55]
 
colors=('orange', 'green', 'deeppink', 'blue', 'red', 'yellow', 'crimson', 'brown','darkorange','pink','gray')

my_circle = plt.Circle((0, 0), 0.7, color='white')

d = plt.pie(sizes, labels=labels, autopct='%0.0f%%',
            startangle=90,colors=colors, labeldistance=1.05)
plt.axis('equal')
plt.gca().add_artist(my_circle)
plt.title('OUT OF 539 SEATS WINNER DISTRIBUTION')            
plt.show()
In [36]:
df.head()
Out[36]:
STATE CONSTITUENCY NAME WINNER PARTY SYMBOL GENDER CRIMINAL CASES AGE CATEGORY EDUCATION ASSETS LIABILITIES GENERAL VOTES POSTAL VOTES TOTAL VOTES OVER TOTAL ELECTORS IN CONSTITUENCY OVER TOTAL VOTES POLLED IN CONSTITUENCY TOTAL ELECTORS STATUS
0 Telangana ADILABAD SOYAM BAPU RAO 1 BJP Lotus MALE HAS CASE 52.0 ST 12th Pass 3099414.0 Rs 2,31,450\n ~ 2 Lacs+ 376892 482 377374 25.330684 35.468248 1489790 MIDDLE CLASS
1 Telangana ADILABAD Godam Nagesh 0 TRS Car MALE NO CASE 54.0 ST Post Graduate 18477888.0 Rs 8,47,000\n ~ 8 Lacs+ 318665 149 318814 21.399929 29.964370 1489790 UPPER MIDDLE CLASS
2 Telangana ADILABAD RATHOD RAMESH 0 INC Hand MALE HAS CASE 52.0 ST 12th Pass 36491000.0 Rs 1,53,00,000\n ~ 1 Crore+ 314057 181 314238 21.092771 29.534285 1489790 UPPER MIDDLE CLASS
3 Telangana ADILABAD NOTA 0 NOTA NO SYMBOL NOT APPLICABLE NO CASE 0.0 NOT APPLICABLE NOT APPLICABLE 0.0 Rs 0 13030 6 13036 0.875023 1.225214 1489790 NO ASSETS
4 Uttar Pradesh AGRA Satyapal Singh Baghel 1 BJP Lotus MALE HAS CASE 58.0 SC Doctorate 74274036.0 Rs 86,06,522\n ~ 86 Lacs+ 644459 2416 646875 33.383823 56.464615 1937690 UPPER MIDDLE CLASS
In [37]:
p = df.groupby('STATE')['CONSTITUENCY'].nunique().reset_index()
plt.figure(figsize=(15,10))
sns.barplot(y='STATE',x='CONSTITUENCY',data=p,palette='spring')
Out[37]:
<matplotlib.axes._subplots.AxesSubplot at 0x2354f4b36a0>
In [38]:
p = df.groupby(['PARTY','EDUCATION','GENDER'])['WINNER'].sum().reset_index().sort_values('WINNER',ascending = False)
p = p[p['WINNER']!=0]
fig = px.bar(p, x='EDUCATION', y='WINNER',hover_data =['PARTY'], color='GENDER', height=650)
fig.show()
In [39]:
p = df.groupby(['PARTY','STATE']).sum().reset_index().sort_values('PARTY',ascending =True)
fig = px.bar(p, x='STATE', y='WINNER',hover_data =['PARTY'], color='PARTY', height=650)
fig.show()
In [40]:
#Creating Age_Group from Age column
AGE_GROUP = []
for i in df['AGE']:
    if i >= 24 and i <=35:
        AGE_GROUP.append('YOUNG AGE')
    elif i >= 36 and i<=60:
        AGE_GROUP.append('MIDDLE AGE')
    elif i >=60:
        AGE_GROUP.append('OLD AGE')
    else:
        AGE_GROUP.append('NOT KNOWN')
df['AGE_GROUP'] = AGE_GROUP
In [41]:
p = df.groupby(['PARTY','AGE_GROUP'])['WINNER'].sum().reset_index().sort_values('WINNER',ascending = False)
p = p[p['WINNER']!=0]
fig = px.bar(p, x='PARTY',y='WINNER',hover_data =['AGE_GROUP'], color='AGE_GROUP', height=650)
fig.show()
In [42]:
p = df.groupby(['PARTY','GENDER'])['WINNER'].count().reset_index().sort_values('WINNER',ascending = False)
#p = p[p['WINNER']!=0]
fig = px.bar(p, x='PARTY',y='WINNER',hover_data =['GENDER'], color='GENDER', height=700)
fig.show()
In [43]:
p = df.groupby(['PARTY','CRIMINAL\nCASES'])['WINNER'].sum().reset_index().sort_values('WINNER',ascending = False)
p = p[p['WINNER']!=0]
fig = px.bar(p, x='PARTY',y='WINNER',hover_data =['CRIMINAL\nCASES'], color='CRIMINAL\nCASES', height=750)
fig.show()
In [44]:
from sklearn.preprocessing import LabelEncoder
labelEncoder_X = LabelEncoder()
df['STATE'] = labelEncoder_X.fit_transform(df['STATE'])
df['CONSTITUENCY'] = labelEncoder_X.fit_transform(df['CONSTITUENCY'])
df['NAME'] = labelEncoder_X.fit_transform(df['NAME'])
df['PARTY'] = labelEncoder_X.fit_transform(df['PARTY'])
df['SYMBOL'] = labelEncoder_X.fit_transform(df['SYMBOL'])
df['GENDER'] = labelEncoder_X.fit_transform(df['GENDER'])
df['CRIMINAL\nCASES'] = labelEncoder_X.fit_transform(df['CRIMINAL\nCASES'])
df['CATEGORY'] = labelEncoder_X.fit_transform(df['CATEGORY'])
df['EDUCATION'] = labelEncoder_X.fit_transform(df['EDUCATION'])
df['STATUS'] = labelEncoder_X.fit_transform(df['STATUS'])
df['AGE_GROUP'] = labelEncoder_X.fit_transform(df['AGE_GROUP'])
In [45]:
X=df.drop(['WINNER','ASSETS','LIABILITIES','GENERAL\nVOTES','POSTAL\nVOTES','AGE','OVER TOTAL ELECTORS \nIN CONSTITUENCY','OVER TOTAL VOTES POLLED \nIN CONSTITUENCY'],axis=1)
y=df['WINNER']
In [46]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=42)
In [47]:
from sklearn.preprocessing import StandardScaler
Scaler_X = StandardScaler()
X_train = Scaler_X.fit_transform(X_train)
X_test = Scaler_X.transform(X_test)
In [48]:
from sklearn.metrics import confusion_matrix, accuracy_score
In [49]:
#Logistic Regression
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(X_train,y_train)
y_pred = lr.predict(X_test)

print(accuracy_score(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))
0.9101620029455081
[[464  26]
 [ 35 154]]
C:\Users\Admin\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py:432: FutureWarning:

Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.

In [50]:
#Decision Tree
from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier()
dtree.fit(X_train,y_train)
predictions = dtree.predict(X_test)
print(accuracy_score(y_test,predictions ))
print(confusion_matrix(y_test,predictions ))
0.8615611192930781
[[448  42]
 [ 52 137]]
In [51]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=200)
rfc.fit(X_train,y_train)
rfc__pred = rfc.predict(X_test)
print(accuracy_score(y_test,rfc__pred))
print(confusion_matrix(y_test,rfc__pred))
0.9116347569955817
[[472  18]
 [ 42 147]]
In [52]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.optimizers import SGD
Using TensorFlow backend.
In [53]:
from keras.wrappers.scikit_learn import KerasClassifier

# Doesn't work?
# class TwoLayerFeedForward:
#     def __call__():
#         clf = Sequential()
#         clf.add(Dense(9, activation='relu', input_dim=3))
#         clf.add(Dense(9, activation='relu'))
#         clf.add(Dense(3, activation='softmax'))
#         clf.compile(loss='categorical_crossentropy', optimizer=SGD())
#         return clf

def twoLayerFeedForward():
    clf = Sequential()
    clf.add(Dense(9, activation='relu', input_dim=3))
    clf.add(Dense(9, activation='relu'))
    clf.add(Dense(3, activation='softmax'))
    clf.compile(loss='categorical_crossentropy', optimizer=SGD(), metrics=["accuracy"])
    return clf


# clf = KerasClassifier(TwoLayerFeedForward(), epochs=100, batch_size=500, verbose=0)
clf = KerasClassifier(twoLayerFeedForward, epochs=100, batch_size=500, verbose=0)
In [54]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.datasets import make_classification
In [66]:
clf = AdaBoostClassifier(n_estimators=100, random_state=0)
clf.fit(X_train, y_train)
AdaBoostClassifier(n_estimators=100, random_state=0)
clf.predict(X_test)
clf.score(X_test, y_test)
Out[66]:
0.9131075110456554
In [62]:
from sklearn.ensemble import GradientBoostingClassifier
In [57]:
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,max_depth=1, random_state=0).fit(X_train, y_train)
clf.score(X_test, y_test)
Out[57]:
0.9086892488954345
In [58]:
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
In [60]:
encoder = LabelEncoder()
encoder.fit(y_train)
encoded_Y = encoder.transform(y_train)
# baseline model
def create_baseline():
    # create model
    model = Sequential()
    model.add(Dense(13, input_dim=13, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
# evaluate model with standardized dataset
estimator = KerasClassifier(build_fn=create_baseline, epochs=100, batch_size=5, verbose=0)
kfold = StratifiedKFold(n_splits=10, shuffle=True)
results = cross_val_score(estimator, X_train, encoded_Y, cv=kfold)
print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))
Baseline: 90.28% (1.61%)
In [ ]: